from pathlib import Path
import urllib.request

import numpy as np
import pandas as pd
# Download the NYC taxi dataset once and cache it locally; skip if present.
url = 'https://s3.amazonaws.com/datashader-data/nyc_taxi_wide.parq'
dfile = Path('large-data/nyc_taxi_wide.parq')
if not dfile.is_file():
    # urlretrieve fails if the target directory does not exist yet.
    dfile.parent.mkdir(parents=True, exist_ok=True)
    urllib.request.urlretrieve(url, dfile)
# Columns we need: pickup/dropoff positions (Web Mercator), hour-of-day,
# passenger counts, and the raw timestamps.
usecols = [
    'dropoff_x', 'dropoff_y', 'pickup_x', 'pickup_y',
    'dropoff_hour', 'pickup_hour', 'passenger_count',
    'tpep_pickup_datetime', 'tpep_dropoff_datetime',
]
df = pd.read_parquet(dfile, engine='fastparquet')[usecols]
# Day of week: Monday=0 ... Sunday=6.
df['weekday'] = df['tpep_pickup_datetime'].dt.dayofweek
df.head()
| dropoff_x | dropoff_y | pickup_x | pickup_y | dropoff_hour | pickup_hour | passenger_count | tpep_pickup_datetime | tpep_dropoff_datetime | weekday | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -8234835.5 | 4975627.0 | -8236963.0 | 4975552.5 | 19 | 19 | 1 | 2015-01-15 19:05:39 | 2015-01-15 19:23:42 | 3 |
| 1 | -8237020.5 | 4976875.0 | -8237826.0 | 4971752.5 | 20 | 20 | 1 | 2015-01-10 20:33:38 | 2015-01-10 20:53:28 | 5 |
| 2 | -8232279.0 | 4986477.0 | -8233561.5 | 4983296.5 | 20 | 20 | 1 | 2015-01-10 20:33:38 | 2015-01-10 20:43:41 | 5 |
| 3 | -8238124.0 | 4971127.0 | -8238654.0 | 4970221.0 | 20 | 20 | 1 | 2015-01-10 20:33:39 | 2015-01-10 20:35:31 | 5 |
| 4 | -8238107.5 | 4974457.0 | -8234433.5 | 4977363.0 | 20 | 20 | 1 | 2015-01-10 20:33:39 | 2015-01-10 20:52:58 | 5 |
# Ride volume per pickup hour (0-23).
hourly = df.groupby('pickup_hour')
pickup_hour_counts = hourly.size()
pickup_hour_counts
pickup_hour
0 422395
1 318624
2 241031
3 176482
4 123661
5 106368
6 237927
7 423407
8 531625
9 551687
10 539556
11 567284
12 604782
13 599189
14 614862
15 602741
16 535178
17 626922
18 753750
19 756882
20 683381
21 657564
22 629954
23 536842
dtype: int64
Since our data is large, and we are interested in planning, it's a good idea to group by pickup hour and cluster each hour independently. Let's choose 18, as it is the end of shift for most people. We still get over half a million points.
# Weekday (Mon-Fri) pickups during the 18:00 hour, coordinates only.
weekday_18 = (df.pickup_hour == 18) & (df.weekday < 5)
df_18 = df.loc[weekday_18, ['pickup_x', 'pickup_y']]
len(df_18)
537710
Let's first visualize the density with datashader; eventually we would like one plot per hour, for both weekdays and weekends.
# Latest pickup in the dataset.
df['tpep_pickup_datetime'].max()
Timestamp('2015-01-31 23:59:59')
# Earliest pickup: the data covers January 2015.
df['tpep_pickup_datetime'].min()
Timestamp('2015-01-01 00:00:00')
For monthly rides, we define a minimum cluster size so that each cluster has at least 100 rides.
# Northernmost pickup in the 18:00 weekday slice (Web Mercator metres).
df_18.pickup_y.max()
4988749.5
import holoviews as hv
from holoviews.element.tiles import OSM
from holoviews.operation.datashader import datashade, dynspread
import colorcet as cc
import datashader as ds
# Use the interactive Bokeh backend for all HoloViews plots below.
hv.extension('bokeh')
# Web Mercator viewport roughly covering Manhattan and surroundings.
x_range, y_range = (-8242000, -8220000), (4966000, 4986000)
# 24-entry colour wheel: one hue per pickup hour.
colors = ["#FF0000", "#FF3F00", "#FF7F00", "#FFBF00", "#FFFF00", "#BFFF00", "#7FFF00", "#3FFF00",
          "#00FF00", "#00FF3F", "#00FF7F", "#00FFBF", "#00FFFF", "#00BFFF", "#007FFF", "#003FFF",
          "#0000FF", "#3F00FF", "#7F00FF", "#BF00FF", "#FF00FF", "#FF00BF", "#FF007F", "#FF003F"]
basemap = OSM().opts(alpha=0.5, width=900, height=500).redim.range(x=x_range, y=y_range)
pickup_pts = hv.Points(df, ['pickup_x', 'pickup_y'])
hour_shade = datashade(pickup_pts, color_key=colors, normalization='eq_hist',
                       aggregator=ds.count_cat('pickup_hour'))
basemap * dynspread(hour_shade, threshold=0.3, max_px=4)
from datashader.colors import Hot  # Hot colormap; also reused further below
x_range, y_range = (-8242000, -8220000), (4966000, 4986000)
bg = OSM().opts(alpha=0.5, width=900, height=500).redim.range(x=x_range, y=y_range)
all_pickups = hv.Points(df, ['pickup_x', 'pickup_y'])
density = datashade(all_pickups, cmap=Hot, normalization='eq_hist')
bg * dynspread(density, threshold=0.3, max_px=4)
To perform clustering we have two options: work in radians and use the great-circle distance, which is more natural, or project onto UTM and use coordinates in meters, which makes the parameters easier to interpret. We will project into UTM; working in meters should also be faster, since the Euclidean distance is cheaper to compute than the great-circle distance.
Alternatively, try using radians, since the conversion is easy.
# Re-inspect the frame before adding lat/lon columns.
df.head()
| dropoff_x | dropoff_y | pickup_x | pickup_y | dropoff_hour | pickup_hour | passenger_count | tpep_pickup_datetime | tpep_dropoff_datetime | weekday | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -8234835.5 | 4975627.0 | -8236963.0 | 4975552.5 | 19 | 19 | 1 | 2015-01-15 19:05:39 | 2015-01-15 19:23:42 | 3 |
| 1 | -8237020.5 | 4976875.0 | -8237826.0 | 4971752.5 | 20 | 20 | 1 | 2015-01-10 20:33:38 | 2015-01-10 20:53:28 | 5 |
| 2 | -8232279.0 | 4986477.0 | -8233561.5 | 4983296.5 | 20 | 20 | 1 | 2015-01-10 20:33:38 | 2015-01-10 20:43:41 | 5 |
| 3 | -8238124.0 | 4971127.0 | -8238654.0 | 4970221.0 | 20 | 20 | 1 | 2015-01-10 20:33:39 | 2015-01-10 20:35:31 | 5 |
| 4 | -8238107.5 | 4974457.0 | -8234433.5 | 4977363.0 | 20 | 20 | 1 | 2015-01-10 20:33:39 | 2015-01-10 20:52:58 | 5 |
We first need latitude and longitude.
from pyproj import Transformer

# Web Mercator (EPSG:3857) -> geographic lat/lon (EPSG:4326).
# With the default axis order of from_crs, transform(x, y) yields (lat, lon),
# matching the assignment order below.
transformer = Transformer.from_crs(3857, 4326)
df["pickup_lat"], df["pickup_lon"] = transformer.transform(df['pickup_x'], df['pickup_y'])
# Radian coordinates, as required by haversine-style distance metrics.
df['pickup_x_rad'] = np.radians(df['pickup_lon'])
df['pickup_y_rad'] = np.radians(df['pickup_lat'])
df.head()
# Verify the new lat/lon and radian columns.
df.head()
| dropoff_x | dropoff_y | pickup_x | pickup_y | dropoff_hour | pickup_hour | passenger_count | tpep_pickup_datetime | tpep_dropoff_datetime | weekday | pickup_x_rad | pickup_y_rad | pickup_lat | pickup_lon | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -8234835.5 | 4975627.0 | -8236963.0 | 4975552.5 | 19 | 19 | 1 | 2015-01-15 19:05:39 | 2015-01-15 19:23:42 | 3 | -1.291437 | 0.711224 | 40.750110 | -73.993898 |
| 1 | -8237020.5 | 4976875.0 | -8237826.0 | 4971752.5 | 20 | 20 | 1 | 2015-01-10 20:33:38 | 2015-01-10 20:53:28 | 5 | -1.291572 | 0.710772 | 40.724245 | -74.001650 |
| 2 | -8232279.0 | 4986477.0 | -8233561.5 | 4983296.5 | 20 | 20 | 1 | 2015-01-10 20:33:38 | 2015-01-10 20:43:41 | 5 | -1.290904 | 0.712143 | 40.802789 | -73.963341 |
| 3 | -8238124.0 | 4971127.0 | -8238654.0 | 4970221.0 | 20 | 20 | 1 | 2015-01-10 20:33:39 | 2015-01-10 20:35:31 | 5 | -1.291702 | 0.710590 | 40.713817 | -74.009088 |
| 4 | -8238107.5 | 4974457.0 | -8234433.5 | 4977363.0 | 20 | 20 | 1 | 2015-01-10 20:33:39 | 2015-01-10 20:52:58 | 5 | -1.291041 | 0.711439 | 40.762430 | -73.971175 |
# Weekday 18:00 pickups in radian coordinates (for haversine clustering).
mask_18 = (df.pickup_hour == 18) & (df.weekday < 5)
df_18 = df.loc[mask_18, ['pickup_x_rad', 'pickup_y_rad']]
It seems we need UTM anyway, since the haversine distance breaks down for many points.
from pyproj.aoi import AreaOfInterest
from pyproj.database import query_utm_crs_info

# Bounding box of all pickups. Use the vectorized Series .min()/.max()
# instead of the Python builtins, which iterate a multi-million-row Series
# element by element.
minx, maxx = df['pickup_lon'].min(), df['pickup_lon'].max()
miny, maxy = df['pickup_lat'].min(), df['pickup_lat'].max()
# Centre of the bounding box (plain arithmetic; no need for np.mean here).
x_center = (minx + maxx) / 2
y_center = (miny + maxy) / 2
# Query the WGS 84 UTM zone containing the bounding-box centre.
utm_crs_list = query_utm_crs_info(
    datum_name="WGS 84",
    area_of_interest=AreaOfInterest(
        west_lon_degree=x_center,
        south_lat_degree=y_center,
        east_lon_degree=x_center,
        north_lat_degree=y_center,
    ),
)
utm_crs_list
[CRSInfo(auth_name='EPSG', code='32618', name='WGS 84 / UTM zone 18N', type=<PJType.PROJECTED_CRS: 'PROJECTED_CRS'>, deprecated=False, area_of_use=AreaOfUse(west=-78.0, south=0.0, east=-72.0, north=84.0, name='Between 78°W and 72°W, northern hemisphere between equator and 84°N, onshore and offshore. Bahamas. Canada - Nunavut; Ontario; Quebec. Colombia. Cuba. Ecuador. Greenland. Haiti. Jamica. Panama. Turks and Caicos Islands. United States (USA). Venezuela.'), projection_method_name='Transverse Mercator')]
# EPSG code of the matched UTM zone (zone 18N for New York).
utm_crs_list[0].code
'32618'
# Project Web Mercator into the UTM zone we just queried (metres).
# Use the queried EPSG code instead of hard-coding 32618, so the notebook
# keeps working for data from a different UTM zone.
utm_epsg = int(utm_crs_list[0].code)
transformer = Transformer.from_crs(3857, utm_epsg, always_xy=True)
df["pickup_x_utm"], df["pickup_y_utm"] = transformer.transform(df['pickup_x'], df['pickup_y'])
# Verify the new UTM columns.
df.head()
| dropoff_x | dropoff_y | pickup_x | pickup_y | dropoff_hour | pickup_hour | passenger_count | tpep_pickup_datetime | tpep_dropoff_datetime | weekday | pickup_x_rad | pickup_y_rad | pickup_lat | pickup_lon | pickup_x_utm | pickup_y_utm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -8234835.5 | 4975627.0 | -8236963.0 | 4975552.5 | 19 | 19 | 1 | 2015-01-15 19:05:39 | 2015-01-15 19:23:42 | 3 | -1.291437 | 0.711224 | 40.750110 | -73.993898 | 584934.173480 | 4.511504e+06 |
| 1 | -8237020.5 | 4976875.0 | -8237826.0 | 4971752.5 | 20 | 20 | 1 | 2015-01-10 20:33:38 | 2015-01-10 20:53:28 | 5 | -1.291572 | 0.710772 | 40.724245 | -74.001650 | 584312.360513 | 4.508626e+06 |
| 2 | -8232279.0 | 4986477.0 | -8233561.5 | 4983296.5 | 20 | 20 | 1 | 2015-01-10 20:33:38 | 2015-01-10 20:43:41 | 5 | -1.290904 | 0.712143 | 40.802789 | -73.963341 | 587444.628700 | 4.517382e+06 |
| 3 | -8238124.0 | 4971127.0 | -8238654.0 | 4970221.0 | 20 | 20 | 1 | 2015-01-10 20:33:39 | 2015-01-10 20:35:31 | 5 | -1.291702 | 0.710590 | 40.713817 | -74.009088 | 583697.255255 | 4.507461e+06 |
| 4 | -8238107.5 | 4974457.0 | -8234433.5 | 4977363.0 | 20 | 20 | 1 | 2015-01-10 20:33:39 | 2015-01-10 20:52:58 | 5 | -1.291041 | 0.711439 | 40.762430 | -73.971175 | 586836.413913 | 4.512894e+06 |
import hdbscan
from collections import Counter
import matplotlib.pyplot as plt

# Weekday 18:00 pickups in radians, the input to the clusterer.
df_18 = df[(df.pickup_hour == 18) & (df.weekday < 5)][['pickup_x_rad', 'pickup_y_rad']]
# NOTE(review): a dead `epsilon = 5` constant and the commented-out
# cluster_selection_epsilon argument were removed -- the epsilon was never
# applied (and a 5-metre value would be wrong for radian coordinates anyway).
clusterer = hdbscan.HDBSCAN(min_cluster_size=100, min_samples=200,
                            cluster_selection_method='eom')
clusterer.fit(df_18)
# Points per cluster label; the label -1 collects all noise points.
cluster_sizes = Counter(clusterer.labels_)
len(cluster_sizes)
363
# Number of distinct clusters relative to the number of points.
len(cluster_sizes)/len(df_18)
0.0013055364415763143
# Re-select the full 18:00 weekday slice and attach the cluster labels.
df_18 = df[(df.pickup_hour == 18) & (df.weekday < 5)].copy()
df_18['labels'] = clusterer.labels_
x_range, y_range = (-8242000, -8220000), (4966000, 4986000)
# Repeat the 24-hue wheel enough times to cover every cluster label.
colors = ["#FF0000", "#FF3F00", "#FF7F00", "#FFBF00", "#FFFF00", "#BFFF00", "#7FFF00", "#3FFF00",
          "#00FF00", "#00FF3F", "#00FF7F", "#00FFBF", "#00FFFF", "#00BFFF", "#007FFF", "#003FFF",
          "#0000FF", "#3F00FF", "#7F00FF", "#BF00FF", "#FF00FF", "#FF00BF", "#FF007F", "#FF003F"] * 21
label_map = OSM().opts(alpha=0.5, width=900, height=500).redim.range(x=x_range, y=y_range)
label_pts = hv.Points(df_18, ['pickup_x', 'pickup_y'])
label_shade = datashade(label_pts, color_key=colors, normalization='eq_hist',
                        aggregator=ds.count_cat('labels'))
label_map * dynspread(label_shade, threshold=0.3, max_px=4)
To visualize the clusters we will find their convex hull.
# Get larger cluster to test convex hull routine
# most_common() sorts by size; index 1 skips the biggest bucket, which is
# typically the noise label (-1).
cluster_sizes.most_common()[1]
(605, 11004)
from shapely.geometry import MultiPoint

# Peek at the raw Web Mercator coordinates of one mid-sized cluster.
df_18.loc[df_18.labels == 107, ['pickup_x', 'pickup_y']].values[:10]
array([[-8232471. , 4978539.5],
[-8232351. , 4978783. ],
[-8233318.5, 4977117. ],
[-8233037.5, 4977627.5],
[-8233976.5, 4975987.5],
[-8232009. , 4979531.5],
[-8233475.5, 4976784. ],
[-8231653. , 4980033. ],
[-8232888. , 4977898. ],
[-8233046. , 4977559. ]], dtype=float32)
# Convex-hull centroid of cluster 107, flattened to a plain [x, y] array.
hull_pts = MultiPoint(df_18.loc[df_18.labels == 107, ['pickup_x', 'pickup_y']].values)
np.ravel(hull_pts.convex_hull.centroid.xy)
array([-8232769.83705982, 4978092.02171123])
# Build, for every non-noise cluster: its convex hull, the hull centroid,
# and a {'x', 'y', 'value'} polygon record for plotting.
c_hulls = []
polys = []
centroids = []
for label in np.unique(clusterer.labels_):
    if label == -1:
        continue  # skip the noise bucket
    members = df_18.loc[df_18.labels == label, ['pickup_x', 'pickup_y']].values
    hull = MultiPoint(members).convex_hull
    centroids.append(np.ravel(hull.centroid.xy))
    xs, ys = hull.exterior.xy
    polys.append({'x': xs, 'y': ys, 'value': label})
    c_hulls.append(hull)
# Number of hulls produced (== number of non-noise clusters).
len(c_hulls)
496
x_range, y_range = (-8242000, -8220000), (4966000, 4986000)
tiles = OSM().opts(alpha=0.5, width=900, height=500).redim.range(x=x_range, y=y_range)
# Clustered (non-noise) pickups, drawn in black under the hull polygons.
points = hv.Points(df_18[df_18.labels != -1], ['pickup_x', 'pickup_y'])
points.opts(color='k')
polys1 = hv.Polygons(polys)
# NOTE(review): the original also built `shader = datashade(points, cmap=Hot)`
# but never composed it into the overlay below, so the dead local is dropped.
tiles * polys1 * points
from alphashape import alphashape

# Convex hull of cluster 0, for comparison ...
c_hulls[0]
# ... against a (tighter) alpha shape of the same points.
mp = MultiPoint(df_18.loc[df_18.labels == 0, ['pickup_x', 'pickup_y']].values)
ash = alphashape(mp, alpha=0.004)
ash
WARNING:root:Singular matrix. Likely caused by all points lying in an N-1 space.
# Point count of the cluster-0 multipoint. Calling len() directly on a
# MultiPoint was deprecated and removed in Shapely 2.0; len(mp.geoms)
# works on both Shapely 1.x and 2.x.
len(mp.geoms)
439